June 12, 2015
library(rvest)
src <- html("http://en.wikipedia.org/wiki/Table_(information)")
node <- html_node(src, css = ".wikitable")
".wikitable" is a CSS selector which says: "grab nodes (aka elements) with a class of wikitable".
html_table() converts a single <table> node to a data frame.
html_table(node)
## First name Last name Age ## 1 Bielat Adamczak 24 ## 2 Blaszczyk Kostrzewski 25 ## 3 Olatunkboh Chijiaku 22 ## 4 Adrienne Anthoula 22 ## 5 Axelia Athanasios 22 ## 6 Jon-Kabat Zinn 22
html("http://en.wikipedia.org/wiki/Table_(information)") %>%
html_node(".wikitable") %>% html_table()
## First name Last name Age ## 1 Bielat Adamczak 24 ## 2 Blaszczyk Kostrzewski 25 ## 3 Olatunkboh Chijiaku 22 ## 4 Adrienne Anthoula 22 ## 5 Axelia Athanasios 22 ## 6 Jon-Kabat Zinn 22
Navigate this page and try the following:
Easy: Grab the table at the bottom of the page (hint: instead of grabbing a node by class with html_node(".class"), you can grab by id with html_node("#id"))
Medium: Grab the actual mean, max, and min temperature.
Hard: Grab the weather history graph and write the figure to disk (download.file() may be helpful here).
(Kudos to Hadley Wickham for the example)
See here for a solution
<table> data?
domain <- "http://www.sec.gov"
susp <- file.path(domain, "litigation/suspensions.shtml")
hrefs <- html(susp) %>% html_nodes("p+ table a") %>% html_attr(name = "href")
tail(hrefs)
## [1] "/litigation/suspensions/2015/34-74326.pdf" ## [2] "/litigation/suspensions/2015/34-74326-o.pdf" ## [3] "/litigation/suspensions/2015/34-74324.pdf" ## [4] "/litigation/suspensions/2015/34-74324-o.pdf" ## [5] "/litigation/suspensions/2015/34-74218.pdf" ## [6] "/litigation/suspensions/2015/34-74218-o.pdf"
# download all the pdfs!
hrefs <- hrefs[!is.na(hrefs)]
pdfs <- paste0(domain, hrefs)
mapply(download.file, pdfs, basename(pdfs))
Navigate to Wikipedia's list of data structures and use SelectorGadget + rvest to do the following:
See here for a solution.
# returns the HTML page source, which is _static_ (no table!)
rvest::html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/")
## <!DOCTYPE html>
## <html><body>
## A Simple Table made with JavaScript
## <p></p>
## <script>
## function tableCreate(){
## var body = document.body,
## tbl = document.createElement('table');
##
## for(var i = 0; i < 3; i++){
## var tr = tbl.insertRow();
## for(var j = 0; j < 3; j++){
## var td = tr.insertCell();
## td.appendChild(document.createTextNode("Cell"));
## }
## }
## body.appendChild(tbl);
## }
## tableCreate();
## </script>
## </body></html>
##
rdom can construct the DOM:
library(rdom)
rdom("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>%
html_node("table") %>% html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
You can give rdom() CSS Selectors directly to avoid sending the entire DOM from phantomjs to R
rdom("http://www.techstars.com/companies/stats/", "table") %>%
html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
GET.
library(httr)
hadley <- GET("https://api.github.com/users/hadley")
content(hadley)[c("name", "company")]
## $name ## [1] "Hadley Wickham" ## ## $company ## [1] "RStudio"
POST, PUT, DELETE, etc…
hadley <- httr::GET("https://api.github.com/users/hadley")
hadley$header[1:3]
## $server ## [1] "GitHub.com" ## ## $date ## [1] "Fri, 12 Jun 2015 16:10:34 GMT" ## ## $`content-type` ## [1] "application/json; charset=utf-8"
XML is a markup language that looks very similar to HTML.
<mariokart>
<driver name="Bowser" occupation="Koopa">
<vehicle speed="55" weight="25"> Wario Bike </vehicle>
<vehicle speed="40" weight="67"> Piranha Prowler </vehicle>
</driver>
<driver name="Peach" occupation="Princess">
<vehicle speed="54" weight="29"> Royal Racer </vehicle>
<vehicle speed="50" weight="34"> Wild Wing </vehicle>
</driver>
</mariokart>
XML2R is a framework to simplify acquisition of tabular/relational XML.
library(XML2R)
obs <- XML2Obs("http://bit.ly/mario-xml")
table(names(obs))
## ## mariokart//driver mariokart//driver//vehicle ## 2 4
obs # named list of observations
## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "40" "67" " Piranha Prowler " ## ## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "54" "29" " Royal Racer " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "50" "34" " Wild Wing " ## ## $`mariokart//driver` ## name occupation ## [1,] "Peach" "Princess"
collapse_obs(obs) # group into table(s) by observational name/unit
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## [2,] "40" "67" " Piranha Prowler " ## [3,] "54" "29" " Royal Racer " ## [4,] "50" "34" " Wild Wing "
obs <- add_key(obs, parent = "mariokart//driver", recycle = "name")
collapse_obs(obs)
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value name ## [1,] "55" "25" " Wario Bike " "Bowser" ## [2,] "40" "67" " Piranha Prowler " "Bowser" ## [3,] "54" "29" " Royal Racer " "Peach" ## [4,] "50" "34" " Wild Wing " "Peach"
Now (if I want) I can merge the tables into a single table…
tabs <- collapse_obs(obs)
merge(tabs[[1]], tabs[[2]], by = "name")
## name occupation speed weight XML_value ## 1 Bowser Koopa 55 25 Wario Bike ## 2 Bowser Koopa 40 67 Piranha Prowler ## 3 Peach Princess 54 29 Royal Racer ## 4 Peach Princess 50 34 Wild Wing
[
{
"driver": "Bowser",
"occupation": "Koopa",
"vehicles": [
{
"model": "Wario Bike",
"speed": 55,
"weight": 25
},
{
"model": "Piranha Prowler",
"speed": 40,
"weight": 67
}
]
},
{
"driver": "Peach",
"occupation": "Princess",
"vehicles": [
{
"model": "Royal Racer",
"speed": 54,
"weight": 29
},
{
"model": "Wild Wing",
"speed": 50,
"weight": 34
}
]
}
]
library(jsonlite)
mario <- fromJSON("http://bit.ly/mario-json")
str(mario) # nested data.frames?!?
## 'data.frame': 2 obs. of 3 variables: ## $ driver : chr "Bowser" "Peach" ## $ occupation: chr "Koopa" "Princess" ## $ vehicles :List of 2 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Wario Bike" "Piranha Prowler" ## .. ..$ speed : int 55 40 ## .. ..$ weight: int 25 67 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Royal Racer" "Wild Wing" ## .. ..$ speed : int 54 50 ## .. ..$ weight: int 29 34
mario$driver
## [1] "Bowser" "Peach"
mario$vehicles
## [[1]] ## model speed weight ## 1 Wario Bike 55 25 ## 2 Piranha Prowler 40 67 ## ## [[2]] ## model speed weight ## 1 Royal Racer 54 29 ## 2 Wild Wing 50 34
How do we get two tables (with a common id) like the XML example?
# this mapply statement is essentially equivalent to add_key
vehicles <- mapply(function(x, y) cbind(x, driver = y),
mario$vehicles, mario$driver, SIMPLIFY = FALSE)
Reduce(rbind, vehicles)
## model speed weight driver ## 1 Wario Bike 55 25 Bowser ## 2 Piranha Prowler 40 67 Bowser ## 3 Royal Racer 54 29 Peach ## 4 Wild Wing 50 34 Peach
mario[!grepl("vehicle", names(mario))]
## driver occupation ## 1 Bowser Koopa ## 2 Peach Princess